Today we will need the following packages:

library(tidyverse) #including ggplot2
library(ggbeeswarm)
library(ggResidpanel)

library(glmmTMB)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.4.1
## Current Matrix version is 1.5.1
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
## Warning in checkDepPackageVersion(dep_pkg = "TMB"): Package version inconsistency detected.
## glmmTMB was built with TMB version 1.8.1
## Current TMB version is 1.9.0
## Please re-install glmmTMB from source or restore original 'TMB' package (see '?reinstalling' for more information)
library(DHARMa)
library(emmeans)
library(car)

Reminder: we simulated those data, using a generative model corresponding to a binomial GLM:

# Simulate binary data from a logistic (binomial GLM) generative model:
# linear predictor -> inverse-logit -> Bernoulli draws.
set.seed(935)
number_observations <- 30
intercept <- 1
effect <- 2

# Continuous predictor, standard normal
predictor <- rnorm(n = number_observations,mean = 0,sd = 1)

# Latent scale (linear predictor), then map to a probability with the inverse logit
latent <- intercept + effect*predictor
probability <- plogis(latent)
# One Bernoulli draw per observation (binomial with size = 1)
response <- sapply(probability, FUN=function(x){rbinom(1,1,x)})

data_binary <- tibble(predictor=predictor, response=response)
# NOTE(review): groupOnX is deprecated in recent ggbeeswarm releases (orientation
# is inferred automatically) -- confirm against the installed version.
ggplot(data_binary, aes(x=predictor, y=response))+ geom_beeswarm(groupOnX = FALSE)

A linear model (in red) shows many pathologies, whereas a binomial GLM does what we want:

# Compare a linear fit (red) with a binomial GLM fit (blue) on the same data.
# fullrange = TRUE plus xlim extrapolates beyond the observed predictor range,
# exposing the linear model's predictions outside [0, 1].
ggplot(data_binary, aes(x=predictor, y=response))+ geom_beeswarm(groupOnX = FALSE) +
  geom_smooth(method="lm", col="red", fill="red", fullrange=TRUE) + 
  geom_smooth(method="glm", col="blue", fill= "blue", method.args = list(family="binomial"), fullrange=TRUE)  +
  xlim(c(-4, 4))
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

What about data generation?

So, a GLM predicts probabilities, values between 0 and 1. But we observe only 0s and 1s. Does the GLM know about that? Yes it does! The GLM relates probabilities to data in a very obvious way: for a predicted probability \(p\), the GLM thinks you should observe a proportion \(p\) of 1s and a proportion \(1-p\) of 0s. If that seems trivial, it is because it is. GLMs for binary data are really easy.

Other GLMs use more complex processes, so let us look more formally at how a binary GLM sees the world.

The distribution turning a probability \(p\) into 0s and 1s is the Bernoulli distribution, a special case of the binomial distribution (with size 1). We can draw random samples from the Bernoulli distribution with rbinom(n=, size=1, prob=)

# 1000 Bernoulli draws with success probability 0.3
# (the outer parentheses print the assigned result)
(bernoulli_sample <- rbinom(n = 1000, size = 1, prob = 0.3))
##    [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1
##   [38] 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 1 0 0 1 0
##   [75] 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0
##  [112] 1 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 1
##  [149] 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1
##  [186] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
##  [223] 1 0 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0
##  [260] 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0
##  [297] 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1
##  [334] 0 0 1 0 0 0 0 1 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 0
##  [371] 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 1
##  [408] 1 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1
##  [445] 0 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 1 1 0 1 0 1 1 0 0
##  [482] 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1
##  [519] 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 0
##  [556] 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0
##  [593] 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0
##  [630] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0
##  [667] 0 1 1 0 1 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 1
##  [704] 0 1 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1
##  [741] 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
##  [778] 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0
##  [815] 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0
##  [852] 1 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0
##  [889] 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0
##  [926] 1 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 0 1 0 1 0 0
##  [963] 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 1 1 1 0 0 0 1 0
## [1000] 0
# The sample mean estimates the success probability (expect ~0.3)
mean(bernoulli_sample)
## [1] 0.297

All GLMs use distributions that have some kind of relationship between their mean and their variance. For the Bernoulli, the relationship is \(variance = mean \times (1-mean)\), and the mean is expected to be equal to the probability.

# Empirical variance vs. the theoretical Bernoulli variance p*(1-p):
# the two should agree closely for a sample of this size
var(bernoulli_sample)
## [1] 0.209
mean(bernoulli_sample)*(1-mean(bernoulli_sample))
## [1] 0.208791

That means that binary data are most variable for a probability of 0.5, and least variable for probabilities close to 0 or close to 1.

# Illustrate the mean-variance relationship: sweep the linear-predictor scale,
# convert to probabilities, and draw one Bernoulli observation per probability.
# (No set.seed here, so the 0/1 points differ between runs.)
nb_rows <- 1000
variability_bernoulli <- tibble(lm_value= seq(-5,5, length.out = nb_rows),
       probability = plogis(lm_value),
       observation = rbinom(n = nb_rows, size = 1, prob = probability))

# Probability curve (colored line) with the simulated 0/1 observations on top
ggplot(variability_bernoulli, aes(x=lm_value, y=probability, col=probability)) +
  geom_line()+
  geom_point(inherit.aes = FALSE, aes(x=lm_value, y=observation))

Let’s simulate survival for each sex based on the predicted survival probabilities:

# Predicted survival probability per sex, on the response scale.
# NOTE(review): vole_s_glm is fitted elsewhere (not visible in this excerpt).
pred_survival <- summary(emmeans(vole_s_glm, ~sex, type = "response"))

# Simulate 100 Bernoulli survival outcomes per sex from the predicted probabilities.
# NOTE(review): summarise() returning more than one row per group is deprecated
# behaviour; dplyr >= 1.1 recommends reframe() for this -- confirm against the
# installed dplyr before re-knitting.
simulated_survival <- pred_survival %>%  
  group_by(sex) %>%
  select(prob) %>%
  summarise(survival = rbinom(n = 100, size = 1, prob = prob), prob=prob)
## Adding missing grouping variables: `sex`
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
# Check simulated mean/variance against the theoretical p and p*(1-p) per sex
simulated_survival %>% group_by(sex) %>%
  summarise(simulated_mean = mean(survival), expected_mean=mean(prob),
            simulated_var = var(survival), expected_var=mean(prob)*(1-mean(prob)))
## # A tibble: 2 × 5
##   sex    simulated_mean expected_mean simulated_var expected_var
##   <fct>           <dbl>         <dbl>         <dbl>        <dbl>
## 1 Female           0.31         0.278         0.216        0.201
## 2 Male             0.14         0.160         0.122        0.134

What a GLM is

From previous examples we saw what a GLM is:

  1. A linear model (response = intercept + slope × predictor . . . ), what you see with summary(glm1)
  2. A “Link function” = a map between the scale of the linear predictor (which ranges from −∞ to +∞) and the scale of the distribution’s mean (a probability between 0 and 1 for the Bernoulli)
  3. A probability distribution (bernoulli, Binomial, Poisson. . . ) assumed to generate the data (either 0 or 1 for bernoulli)

glm scales

\[ y_i = \alpha + \beta x_i \\ p_i = \mathrm{logit}^{-1}(y_i) \\ \mathrm{obs}_i \sim Bernoulli(p_i) \]

More practice with snow voles

Let’s consider the relationship between mass and survival.

# Raw mass vs. survival scatter; alpha shows overplotting of the 0/1 outcomes.
# NOTE(review): survdat_early is loaded elsewhere (not visible in this excerpt).
ggplot(survdat_early, aes(x = mass, y=survival)) +
  geom_point(alpha=0.2)
## Warning: Removed 13 rows containing missing values (geom_point).

Difficult to see much on this plot. Better to add some jitter/beeswarm and a glm fit:

# Same data with beeswarm jitter plus a binomial GLM smooth
ggplot(survdat_early, aes(x = mass, y=survival)) +
  geom_beeswarm(groupOnX = FALSE) +
  geom_smooth(method = "glm", method.args=list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 13 rows containing non-finite values (stat_smooth).
## Warning: Removed 13 rows containing missing values (position_beeswarm).

# NOTE(review): no family= argument is given, so glm() falls back to the
# default gaussian family -- this fits a *linear* model to the 0/1 response
# (the summary output below reports a gaussian dispersion parameter, and the
# DHARMa diagnostics further down reject uniformity strongly). For a binomial
# GLM this should be family = "binomial"; confirm whether the omission is
# deliberate pedagogy before re-knitting.
vole_glm <- glm(survival ~ mass, data = survdat_early)
summary(vole_glm)
## 
## Call:
## glm(formula = survival ~ mass, data = survdat_early)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2954  -0.2466  -0.2015  -0.1490   0.8736  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.336739   0.045189   7.452 2.41e-13 ***
## mass        -0.003755   0.001403  -2.677  0.00759 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1718025)
## 
##     Null deviance: 137.64  on 795  degrees of freedom
## Residual deviance: 136.41  on 794  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 860.87
## 
## Number of Fisher Scoring iterations: 2

So it looks like higher body mass corresponds to lower survival across the population. (Note, however, that glm() was called here without a family argument, so it used the default gaussian family — as the summary above shows — meaning this is effectively a linear model for a binary response; the residual diagnostics below confirm the resulting misfit.)

# DHARMa simulation-based residual diagnostics for the fit above
plot(simulateResiduals(vole_glm))

# The uniformity test below rejects strongly (p < 2.2e-16) -- consistent with
# the gaussian family having been used for binary data (see note at the fit).
testResiduals(vole_glm)

## $uniformity
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  simulationOutput$scaledResiduals
## D = 0.37708, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## 
## $dispersion
## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 0.99462, p-value = 0.976
## alternative hypothesis: two.sided
## 
## 
## $outliers
## 
##  DHARMa outlier test based on exact binomial test with approximate
##  expectations
## 
## data:  simulationOutput
## outliers at both margin(s) = 1, observations = 796, p-value = 0.02573
## alternative hypothesis: true probability of success is not equal to 0.007968127
## 95 percent confidence interval:
##  3.180579e-05 6.979485e-03
## sample estimates:
## frequency of outliers (expected: 0.00796812749003984 ) 
##                                            0.001256281
## $uniformity
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  simulationOutput$scaledResiduals
## D = 0.37708, p-value < 2.2e-16
## alternative hypothesis: two-sided
## 
## 
## $dispersion
## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 0.99462, p-value = 0.976
## alternative hypothesis: two.sided
## 
## 
## $outliers
## 
##  DHARMa outlier test based on exact binomial test with approximate
##  expectations
## 
## data:  simulationOutput
## outliers at both margin(s) = 1, observations = 796, p-value = 0.02573
## alternative hypothesis: true probability of success is not equal to 0.007968127
## 95 percent confidence interval:
##  3.180579e-05 6.979485e-03
## sample estimates:
## frequency of outliers (expected: 0.00796812749003984 ) 
##                                            0.001256281
# Quantile-regression test on the residuals, with more simulations for power
DHARMa::testQuantiles(simulateResiduals(vole_glm, n = 10000))

## 
##  Test for location of quantiles via qgam
## 
## data:  simulationOutput
## p-value < 2.2e-16
## alternative hypothesis: both

However, we should realize that the negative relationship is a consequence of sex-age structure:

# Color points by sex-age class: the overall negative trend reflects group
# structure (adults are heavier and survive less) rather than a within-class
# effect of mass.
ggplot(survdat_early, aes(x = mass, y=survival)) +
  geom_beeswarm(groupOnX = FALSE, aes(col=interaction(sex, age))) + 
  geom_smooth(method = "glm", method.args=list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 13 rows containing non-finite values (stat_smooth).
## Warning: Removed 13 rows containing missing values (position_beeswarm).

We need sex and age in our models if we want to estimate the effect of mass independent of sex and age, which have rather trivial effects on survival (these animals have short life spans; adults senesce quickly and tend to get sick before their second winter; we do not think this has much to do with body mass; also, males get into nasty fights, especially when they become adults and can die from injuries.)

Let’s ask ggplot to show us the effect of mass, sex and age together:

# Putting color in the top-level aes() makes geom_smooth fit one binomial GLM
# per sex-age class (equivalent to a full interaction model)
ggplot(survdat_early, aes(x=mass, y= survival, color=interaction(sex,age)))+
  geom_beeswarm(groupOnX = FALSE) + 
  geom_smooth(method = "glm", method.args=list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 13 rows containing non-finite values (stat_smooth).
## Warning: Removed 13 rows containing missing values (position_beeswarm).

That plot corresponds to the model:

# Full three-way interaction model (mass x sex x age), binomial family
summary(m_vole_interaction <- glm(survival ~ 1 + mass * sex*age,
                 data=survdat_early, family = "binomial"))
## 
## Call:
## glm(formula = survival ~ 1 + mass * sex * age, family = "binomial", 
##     data = survdat_early)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4357  -0.6466  -0.5820  -0.4735   2.1753  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)       -2.102756   1.434075  -1.466    0.143
## mass               0.009589   0.034936   0.274    0.784
## sexMale           -1.840155   2.810890  -0.655    0.513
## ageJ              -0.218154   1.607431  -0.136    0.892
## mass:sexMale       0.035693   0.063822   0.559    0.576
## mass:ageJ          0.069063   0.046196   1.495    0.135
## sexMale:ageJ       2.080420   3.003920   0.693    0.489
## mass:sexMale:ageJ -0.091311   0.077409  -1.180    0.238
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 843.57  on 795  degrees of freedom
## Residual deviance: 789.62  on 788  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 805.62
## 
## Number of Fisher Scoring iterations: 4
# Type III likelihood-ratio tests for the interaction model
car::Anova(m_vole_interaction, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: survival
##              LR Chisq Df Pr(>Chisq)
## mass          0.07491  1     0.7843
## sex           0.43188  1     0.5111
## age           0.01843  1     0.8920
## mass:sex      0.31293  1     0.5759
## mass:age      2.28091  1     0.1310
## sex:age       0.48287  1     0.4871
## mass:sex:age  1.39295  1     0.2379

That may be an interesting model, but it is not really what we are after. We were trying to ask What is the effect of mass on survival, as a way to quantify natural selection overall, not within sex-age classes.

So, a better model is:

# Additive effect of mass (the quantity of interest), with sex-age structure
# controlled for via the sex*age interaction
summary(final_model <- glm(survival ~ 1 + mass + sex*age,
                 data=survdat_early, family = "binomial"))
## 
## Call:
## glm(formula = survival ~ 1 + mass + sex * age, family = "binomial", 
##     data = survdat_early)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2124  -0.6698  -0.5707  -0.4742   2.1616  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.39598    0.74099  -4.583 4.58e-06 ***
## mass          0.04113    0.01723   2.387   0.0170 *  
## sexMale      -0.36042    0.34853  -1.034   0.3011    
## ageJ          1.95618    0.39468   4.956 7.18e-07 ***
## sexMale:ageJ -0.71355    0.40679  -1.754   0.0794 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 843.57  on 795  degrees of freedom
## Residual deviance: 792.36  on 791  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 802.36
## 
## Number of Fisher Scoring iterations: 4
# Type III likelihood-ratio tests for the final model
car::Anova(final_model, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: survival
##         LR Chisq Df Pr(>Chisq)    
## mass      5.7211  1    0.01676 *  
## sex       1.0954  1    0.29528    
## age      26.3603  1  2.833e-07 ***
## sex:age   2.9827  1    0.08416 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# DHARMa diagnostics for the final (binomial) model; no misfit detected below
plot(simulateResiduals(final_model))

testResiduals(final_model)

## $uniformity
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  simulationOutput$scaledResiduals
## D = 0.028785, p-value = 0.5246
## alternative hypothesis: two-sided
## 
## 
## $dispersion
## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 0.99935, p-value = 0.984
## alternative hypothesis: two.sided
## 
## 
## $outliers
## 
##  DHARMa outlier test based on exact binomial test with approximate
##  expectations
## 
## data:  simulationOutput
## outliers at both margin(s) = 3, observations = 796, p-value = 0.2308
## alternative hypothesis: true probability of success is not equal to 0.007968127
## 95 percent confidence interval:
##  0.0007779017 0.0109743246
## sample estimates:
## frequency of outliers (expected: 0.00796812749003984 ) 
##                                            0.003768844
## $uniformity
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  simulationOutput$scaledResiduals
## D = 0.028785, p-value = 0.5246
## alternative hypothesis: two-sided
## 
## 
## $dispersion
## 
##  DHARMa nonparametric dispersion test via sd of residuals fitted vs.
##  simulated
## 
## data:  simulationOutput
## dispersion = 0.99935, p-value = 0.984
## alternative hypothesis: two.sided
## 
## 
## $outliers
## 
##  DHARMa outlier test based on exact binomial test with approximate
##  expectations
## 
## data:  simulationOutput
## outliers at both margin(s) = 3, observations = 796, p-value = 0.2308
## alternative hypothesis: true probability of success is not equal to 0.007968127
## 95 percent confidence interval:
##  0.0007779017 0.0109743246
## sample estimates:
## frequency of outliers (expected: 0.00796812749003984 ) 
##                                            0.003768844
# Quantile test with more simulations; the large p-value indicates no misfit
DHARMa::testQuantiles(simulateResiduals(final_model, n = 10000))

## 
##  Test for location of quantiles via qgam
## 
## data:  simulationOutput
## p-value = 0.9911
## alternative hypothesis: both

From that model we conclude that mass is positively selected in this vole population.

Let’s visualize the prediction from that model for each age-sex class:

# Model predictions on the probability scale over a grid of 100 mass values
# spanning the full observed range, for every sex-age combination
fullpredictions <- as_tibble(emmeans(final_model, ~age*sex + mass,
                         at = list(mass = seq(min(survdat_early$mass, na.rm = TRUE),
                                              max(survdat_early$mass, na.rm = TRUE),
                                              length.out = 100)), type="response"))

# Overlay the predicted survival curves on the raw data
ggplot(survdat_early, aes(x=mass, y= survival, color=interaction(sex,age)))+
  geom_beeswarm(groupOnX = FALSE)+
  geom_line(data = fullpredictions, aes(x=mass, y=prob))
## Warning: Removed 13 rows containing missing values (position_beeswarm).

We can add confidence intervals:

# Same plot with 95% asymptotic confidence ribbons from emmeans
# (asymp.LCL / asymp.UCL columns)
ggplot(survdat_early, aes(x=mass, y= survival, color=interaction(sex,age)))+
  geom_beeswarm(groupOnX = FALSE)+
  geom_line(data = fullpredictions, aes(x=mass, y=prob))+
  geom_ribbon(data = fullpredictions, inherit.aes = FALSE, alpha=0.1,
              aes(x=mass, ymin=asymp.LCL, ymax=asymp.UCL, fill=interaction(sex,age)))
## Warning: Removed 13 rows containing missing values (position_beeswarm).

Note that the juvenile and adult mass are almost non-overlapping distributions. So the predictions are extrapolating into regions where there are no voles in the data. This may be problematic. Do you think that if we could fatten juvenile females up to 60 grams their survival probability would be 75% ?

We should probably show predictions only in the range where data exist:

# Observed mass range per sex-age class, used below to clip the predictions
# to regions where data actually exist
massextr <- survdat_early %>% 
  group_by(sex, age) %>%
  summarise(minmass = min(mass, na.rm = TRUE), maxmass=max(mass, na.rm = TRUE))
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
# Keep predictions only within the observed mass range of each sex-age class.
# A join on (sex, age) replaces the original hand-written four-branch filter,
# which silently depended on the row order of massextr (massextr$minmass[1]
# had to be Female/A, [2] Female/J, etc.); the join is robust to reordering
# and to any additional classes. inner_join preserves the row order of
# fullpredictions, and dropping the helper columns keeps the result's
# columns identical to before, so downstream plots are unchanged.
filteredpredictions <- fullpredictions %>%
  inner_join(massextr, by = c("sex", "age")) %>%
  filter(mass >= minmass, mass <= maxmass) %>%
  select(-minmass, -maxmass)

# Final figure: raw data plus prediction curves and ribbons clipped to the
# observed mass range of each sex-age class
ggplot(survdat_early, aes(x=mass, y= survival, color=interaction(sex,age)))+
  geom_beeswarm(groupOnX = FALSE) +
  geom_line(data=filteredpredictions, aes(y = prob)) +
  geom_ribbon(data=filteredpredictions, aes(x=mass, ymin=asymp.LCL, ymax=asymp.UCL, fill=interaction(sex,age)),
  inherit.aes = FALSE, alpha=0.3)
## Warning: Removed 13 rows containing missing values (position_beeswarm).